/*************************************************************************************************************
This ado-file loads QCEW data from https://data.bls.gov/cew/datatoc.htm.
Required:
naics or sic classification
Options:
year0: first year to extract data
year1: last year to extract data
latestyear: latest year of data available
latestquarter: last quarter available for latest year
newrelease: extracts data for latest and prior year from the new release page
state: collapse to state level
national: collapse to national level
fips: keep only specified fips (maximum 10)
agglvl: keep only specified aggregation levels (see BLS documentation)
industry: keep only specified industries
vintage: specify a vintage captured by the way back machine. Example: 20160409031503
saving: location to save data set

Examples: 
Download NAICS data, by state and industry, for 1990-2017:
loadqcew naics, year0(1990) year1(2017) state saving(myfolder/qcewdata)
Download NAICS 4 digit data, by county, for 1990-2017 using annual files:
loadqcew naics, year0(1990) year1(2017) agglvl(70 71 76) frequency(annual) saving(myfolder/qcewdata)
Download NAICS by size class:
loadqcew naics, year0(1990) year1(2019) sizeclass saving(myfolder/qcewdata)

Written by Gabriel Chodorow-Reich and provided without warranty.
This version: September 2018.

*************************************************************************************************************/

capture program drop loadqcew
program define loadqcew
version 12.1
syntax name(name=classification id="naics or sic"), year0(integer) year1(integer) [latestyear(integer 0) latestquarter(integer 4) newrelease frequency(string) state national fips(numlist) agglvl(numlist) industry(string) sizeclass vintage(string) saving(string)]
#delimit;
set more off;

/*************************************************************************************************************
0. Preliminaries
*************************************************************************************************************/
if `"`agglvl'"' != `""' local keep `"keep if inlist(agglvl_code,`=subinstr(`"`agglvl'"',`" "',`","',.)')"';
else if `"`classification'"'==`"naics"' {;
	if `"`state'"'==`"state"' local keep `"keep if inrange(agglvl_code,50,64)"';
	else if `"`national'"'==`"national"' {;
		local fips "US000";
		local keep `"keep if inrange(agglvl_code,10,28)"';
	};
};
else if `"`classification'"'==`"sic"' {;
	if `"`state'"'==`"state"' local keep `"keep if inrange(agglvl_code,18,23)"';
	else if `"`national'"'==`"national"' local keep `"keep if inrange(agglvl_code,1,11)"';
};

if `"`fips'"'!=`""' & `year0'<2012 {;
	foreach code of local fips {;
		local code = substr("0000",1,5-length(`"`code'"')) + "`code'"; /*Add leading zero to make 5 characters in length*/
		local fips_list `"`fips_list',`"`code'"'"';
	};
	if `"`keep'"'==`""' local keep `"keep if inlist(area_fips `fips_list')"';
	else local keep `"`keep' & inlist(area_fips `fips_list')"';
};

if `"`industry'"'!=`""' & `year0'<2012 {;
	foreach i of local industry {;
		local industry_list `"`industry_list',`"`=regexr(`"`i'"',`"_"',`"-"')'"'"';
	};
	if `"`keep'"'==`""' local keep `"keep if inlist(industry_code `industry_list')"';
	else local keep `"`keep' & inlist(industry_code `industry_list')"';
};	
	
if `"`vintage'"'!=`""' local vintageprefix `"https://web.archive.org/web/`vintage'/"';
if inlist(`"`frequency'"',`""',`"quarterly"') local frequency qtrly;


/*************************************************************************************************************
1. Load flat files
*************************************************************************************************************/
forvalues ccyy = `year0'/`year1' {;
	di `ccyy';
	if `"`sizeclass'"'==`"sizeclass"' {;
		qui copy `vintageprefix'https://data.bls.gov/cew/data/files/`ccyy'/csv/`ccyy'_q1_by_size.zip `ccyy'_q1_by_size.zip, replace;
		qui unzipfile `ccyy'_q1_by_size.zip, replace;
		qui import delimited using `ccyy'.q1.by_size.csv, clear;
		erase `ccyy'.q1.by_size.csv;
		erase `ccyy'_q1_by_size.zip;
	};
	else if `"`fips'"'!=`""' & `year0'>=year(date("$S_DATE","DMY"),"%tdCCYYNNDD")-5 {; /*Single area files available for past five years for much faster download*/
		foreach f of local fips {;
			forvalues q = 1/4 {;
				if `ccyy'==`latestyear' & `q'>`latestquarter' continue;
				qui import delimited using https://data.bls.gov/cew/data/api/`ccyy'/`q'/area/`f'.csv, clear;
				if `q'==1 & `"`f'"'==`"`: word 1 of `fips''"' tempfile dataccyy;
				else qui append using `dataccyy';
				qui save `dataccyy', replace;
			};
		};
	};
	else if `"`industry'"'!=`""' & `year0'>=year(date("$S_DATE","DMY"),"%tdCCYYNNDD")-5 {; /*Single industry files available for past five years for much faster download*/
		foreach i of local industry {;
			forvalues q = 1/4 {;
				if `ccyy'==`latestyear' & `q'>`latestquarter' continue;
				qui import delimited using https://data.bls.gov/cew/data/api/`ccyy'/`q'/industry/`i'.csv, clear;
				cap tostring industry_code, replace;
				if `q'==1 & `"`i'"'==`"`: word 1 of `industry''"' tempfile dataccyy;
				else qui append using `dataccyy';
				qui save `dataccyy', replace;
			};
		};
	};			
	else if `"`classification'"'==`"naics"' {;
		if `"`newrelease'"'==`"newrelease"' & `ccyy'==`latestyear' qui copy https://data.bls.gov/web/cewqtr/curr_yr_qtrly_singlefile.zip `ccyy'_qtrly_singlefile.zip, replace;
		else if `"`newrelease'"'==`"newrelease"' & `ccyy'==`latestyear'-1 qui copy https://data.bls.gov/web/cewqtr/prior_yr_qtrly_singlefile.zip `ccyy'_qtrly_singlefile.zip, replace;
		else qui copy `vintageprefix'https://data.bls.gov/cew/data/files/`ccyy'/csv/`ccyy'_`frequency'_singlefile.zip `ccyy'_`frequency'_singlefile.zip, replace;
		qui unzipfile `ccyy'_`frequency'_singlefile, replace;
		if `latestquarter'<4 & `ccyy'==`latestyear' local periods q1-q`latestquarter';
		else if `"`frequency'"'==`"qtrly"' local periods q1-q4;
		else if `"`frequency'"'==`"annual"' local periods annual;
		qui import delimited using `ccyy'.`periods'.singlefile.csv, clear;
		erase `ccyy'.`periods'.singlefile.csv;
		erase `ccyy'_`frequency'_singlefile.zip;
	};
	else if `"`classification'"'==`"sic"' {;
		qui copy `vintageprefix'https://data.bls.gov/cew/data/files/`ccyy'/sic/csv/sic_`ccyy'_`frequency'_singlefile.zip `ccyy'_`frequency'_singlefile.zip, replace;
		qui unzipfile `ccyy'_`frequency'_singlefile, replace;
		if `"`frequency'"'==`"qtrly"' local periods q1-q4;
		else if `"`frequency'"'==`"annual"' local periods annual;
		qui import delimited using sic.`ccyy'.`periods'.singlefile.csv, clear;
		erase sic.`ccyy'.`periods'.singlefile.csv;
		erase `ccyy'_`frequency'_singlefile.zip;
	};
	
	qui `keep'; /*Only keep selected data*/
	cap tostring disclosure_code, replace;
	cap tostring lq_disclosure_code, replace;

	if `"`first'"'==`""' {;
		tempfile data;
		qui save `data';
		local first first;
	};
	else {;
		qui append using `data';
		qui save `data', replace;
	};
};

tempfile data;
if `"`frequency'"'==`"qtrly"' {;
	gen int quarterly = yq(year,qtr);
	format quarterly %tq;
};

/*************************************************************************************************************
2. Documentation files
*************************************************************************************************************/
foreach var in agglevel industry ownership area {;
	preserve;
	if `"`classification'"'==`"naics"' {;
		qui import delimited using https://data.bls.gov/cew/doc/titles/`var'/`var'_titles.csv, varnames(1) clear;
	};
	else if `"`classification'"'==`"sic"' {;
		qui import delimited using https://data.bls.gov/cew/doc/titles/`var'/sic_`var'_titles.csv, varnames(1) clear;
	};
	tempfile `var';
	qui save ``var'';
	restore;
	if "`var'"=="ownership" & `"`classification'"'==`"naics"' {;
		merge m:1 own_code using ``var'', keep(master matched) nogenerate;
	};
	else if "`var'"=="ownership" & `"`classification'"'==`"sic"' {;
		qui gen ownership_code = own_code;
		merge m:1 ownership_code using ``var'', keep(master matched) nogenerate;
	};
	else if "`var'"=="area" {;
		merge m:1 area_fips using ``var'', keep(master matched) nogenerate;
	};
	else if "`var'"=="agglevel" {;
		merge m:1 agglvl_code using ``var'', keep(master matched) nogenerate;
	};	
	else if `"`classification'"'==`"naics"' {;
		merge m:1 `var'_code using ``var'', keep(master matched) nogenerate;
	};
	else {;
		qui gen csv_`var'_code = `var'_code;
		merge m:1 csv_`var'_code using ``var'', keep(master matched) nogenerate;
	};
	qui save `data', replace;
};

/*************************************************************************************************************
3. Save
*************************************************************************************************************/
sort area_fips year qtr industry_code own_code;
if `"`frequency'"'==`"qtrly"' order area_title quarterly industry_title own*_title month* total_qtrly;
else order area_title year industry_title own*_title annual_avg_emplvl total_annual_wages;
compress;
desc;

if `"`saving'"'!=`""' {;
	local filename = regexr(`"`saving'"',`"^.*/"',`""');
	local filename = regexr(`"`filename'"',`"^.*\\"',`""');
	qui save `"`filename'"', replace;
	qui zipfile `"`filename'.dta"', saving(`"`saving'"', replace);
	erase `"`filename'.dta"';
};


end;

#delimit cr 
